{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pyltp\n",
    "import time\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 24.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Directory that holds the LTP model files (ltp_data_v3.4.0).\n",
    "LTP_DIR = \"./model/ltp_data_v3.4.0\"\n",
    "\n",
    "# Input articles; the CSV is decoded as GBK.\n",
    "articles = pd.read_csv(\"articles.csv\", encoding=\"gbk\")\n",
    "\n",
    "\n",
    "def _load_ltp_model(model, file_name):\n",
    "    \"\"\"Load `file_name` from LTP_DIR into `model` and return the model.\"\"\"\n",
    "    model.load(os.path.join(LTP_DIR, file_name))\n",
    "    return model\n",
    "\n",
    "\n",
    "# Word segmentation\n",
    "segmentor = _load_ltp_model(pyltp.Segmentor(), \"cws.model\")\n",
    "# Part-of-speech tagging\n",
    "postagger = _load_ltp_model(pyltp.Postagger(), \"pos.model\")\n",
    "# Named-entity recognition\n",
    "recognizer = _load_ltp_model(pyltp.NamedEntityRecognizer(), \"ner.model\")\n",
    "# Dependency parsing\n",
    "parser = _load_ltp_model(pyltp.Parser(), \"parser.model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 14.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Build one record per sentence, modelled on DeepDive's `sentences` relation.\n",
    "# Fields, in output order:\n",
    "#   doc_id, sentence_index, sentence_text, tokens, pos_tags, ner_tags,\n",
    "#   dep_types, dep_tokens\n",
    "# The `lemmas` and `doc_offsets` columns of that relation are not produced.\n",
    "# Fields are separated by \"&&&&\"; items inside a list field by \"@@\".\n",
    "#\n",
    "# NOTE: the LTP models are released at the end of this cell, so the\n",
    "# model-loading cell has to be run again before re-running this one.\n",
    "\n",
    "FIELD_SEP = \"&&&&\"\n",
    "ITEM_SEP = \"@@\"\n",
    "\n",
    "sentences = []\n",
    "\n",
    "# Walk the two needed columns directly instead of calling articles.iloc[...]\n",
    "# once per article and again for every sentence.\n",
    "for doc_id, text in zip(articles['id'], articles['text']):\n",
    "    # pandas reads an empty CSV field as NaN; there is nothing to split.\n",
    "    if pd.isna(text):\n",
    "        continue\n",
    "\n",
    "    # Sentence splitting; sentence_index is 1-based within the article.\n",
    "    for index, sent in enumerate(pyltp.SentenceSplitter.split(text), start=1):\n",
    "        # tokens: word segmentation\n",
    "        tokens = list(segmentor.segment(sent))\n",
    "        # pos_tags: one part-of-speech tag per token\n",
    "        pos_tags = list(postagger.postag(tokens))\n",
    "        # ner_tags: one named-entity tag per token\n",
    "        ner_tags = list(recognizer.recognize(tokens, pos_tags))\n",
    "\n",
    "        # dep_types / dep_tokens: relation label and head index of each arc.\n",
    "        # Plain lists are used so a sentence without arcs yields empty\n",
    "        # fields instead of failing on 2-D indexing of an empty array.\n",
    "        arcs = list(parser.parse(tokens, pos_tags))\n",
    "        dep_types = [arc.relation for arc in arcs]\n",
    "        dep_tokens = [str(arc.head) for arc in arcs]\n",
    "\n",
    "        fields = [\n",
    "            str(doc_id),\n",
    "            str(index),\n",
    "            sent,\n",
    "            ITEM_SEP.join(tokens),\n",
    "            ITEM_SEP.join(pos_tags),\n",
    "            ITEM_SEP.join(ner_tags),\n",
    "            ITEM_SEP.join(dep_types),\n",
    "            ITEM_SEP.join(dep_tokens),\n",
    "        ]\n",
    "        sentences.append(FIELD_SEP.join(fields))\n",
    "\n",
    "# One record per row, written without index or header line.\n",
    "df = pd.DataFrame({'sentences_list': sentences}, columns=['sentences_list'])\n",
    "df.to_csv(\"sentences_nlp.csv\", index=False, header=False, encoding=\"UTF-8\")\n",
    "\n",
    "# Free the native LTP models.\n",
    "segmentor.release()\n",
    "postagger.release()\n",
    "recognizer.release()\n",
    "parser.release()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
